{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Setting up the Knowledge Graph Datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "\n",
    "from aips import get_engine\n",
    "from aips.spark.dataframe import from_csv\n",
    "\n",
    "spark = SparkSession.builder.appName(\"AIPS\").getOrCreate()\n",
    "engine = get_engine()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "## Download the Datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Cloning into 'jobs'...\n",
      "remote: Enumerating objects: 4, done.\u001b[K\n",
      "remote: Counting objects: 100% (4/4), done.\u001b[K\n",
      "remote: Compressing objects: 100% (4/4), done.\u001b[K\n",
      "remote: Total 4 (delta 0), reused 4 (delta 0), pack-reused 0 (from 0)\u001b[K\n",
      "Receiving objects: 100% (4/4), 47.24 MiB | 24.89 MiB/s, done.\n",
      "Already up to date.\n",
      "._jobs.csv\n",
      "jobs.csv\n",
      "Cloning into 'health'...\n",
      "remote: Enumerating objects: 4, done.\u001b[K\n",
      "remote: Counting objects: 100% (4/4), done.\u001b[K\n",
      "remote: Compressing objects: 100% (3/3), done.\u001b[K\n",
      "remote: Total 4 (delta 0), reused 3 (delta 0), pack-reused 0 (from 0)\u001b[K\n",
      "Receiving objects: 100% (4/4), 6.57 MiB | 21.35 MiB/s, done.\n",
      "Already up to date.\n",
      "._posts.csv\n",
      "posts.csv\n",
      "Cloning into 'scifi'...\n",
      "remote: Enumerating objects: 4, done.\u001b[K\n",
      "remote: Counting objects: 100% (4/4), done.\u001b[K\n",
      "remote: Compressing objects: 100% (3/3), done.\u001b[K\n",
      "remote: Total 4 (delta 0), reused 4 (delta 0), pack-reused 0 (from 0)\u001b[K\n",
      "Receiving objects: 100% (4/4), 85.09 MiB | 26.01 MiB/s, done.\n",
      "Already up to date.\n",
      "._posts.csv\n",
      "posts.csv\n",
      "Cloning into 'cooking'...\n",
      "remote: Enumerating objects: 4, done.\u001b[K\n",
      "remote: Counting objects: 100% (4/4), done.\u001b[K\n",
      "remote: Compressing objects: 100% (3/3), done.\u001b[K\n",
      "remote: Total 4 (delta 0), reused 3 (delta 0), pack-reused 0 (from 0)\u001b[K\n",
      "Receiving objects: 100% (4/4), 24.48 MiB | 25.53 MiB/s, done.\n",
      "Already up to date.\n",
      "._posts.csv\n",
      "posts.csv\n",
      "Cloning into 'travel'...\n",
      "remote: Enumerating objects: 4, done.\u001b[K\n",
      "remote: Counting objects: 100% (4/4), done.\u001b[K\n",
      "remote: Compressing objects: 100% (3/3), done.\u001b[K\n",
      "remote: Total 4 (delta 0), reused 3 (delta 0), pack-reused 0 (from 0)\u001b[K\n",
      "Receiving objects: 100% (4/4), 40.98 MiB | 25.76 MiB/s, done.\n",
      "Already up to date.\n",
      "._posts.csv\n",
      "posts.csv\n",
      "Cloning into 'devops'...\n",
      "remote: Enumerating objects: 4, done.\u001b[K\n",
      "remote: Counting objects: 100% (4/4), done.\u001b[K\n",
      "remote: Compressing objects: 100% (3/3), done.\u001b[K\n",
      "remote: Total 4 (delta 0), reused 3 (delta 0), pack-reused 0 (from 0)\u001b[K\n",
      "Receiving objects: 100% (4/4), 3.81 MiB | 20.67 MiB/s, done.\n",
      "Already up to date.\n",
      "._posts.csv\n",
      "posts.csv\n"
     ]
    }
   ],
   "source": [
    "#jobs\n",
    "![ ! -d 'jobs' ] && git clone --depth=1 https://github.com/ai-powered-search/jobs.git\n",
    "! cd jobs && git pull\n",
    "! cd jobs && mkdir -p '../data/jobs/' && tar -xvf jobs.tgz -C '../data/jobs/'    \n",
    "\n",
    "#health\n",
    "![ ! -d 'health' ] && git clone --depth=1 https://github.com/ai-powered-search/health.git\n",
    "! cd health && git pull\n",
    "! cd health && mkdir -p '../data/health/' && tar -xvf health.tgz -C '../data/health/'\n",
    "\n",
    "#scifi\n",
    "![ ! -d 'scifi' ] && git clone --depth=1 https://github.com/ai-powered-search/scifi.git\n",
    "! cd scifi && git pull\n",
    "! cd scifi && mkdir -p '../data/scifi/' && tar -xvf scifi.tgz -C '../data/scifi/' \n",
    "\n",
    "#cooking\n",
    "![ ! -d 'cooking' ] && git clone --depth=1 https://github.com/ai-powered-search/cooking.git\n",
    "! cd cooking && git pull\n",
    "! cd cooking && mkdir -p '../data/cooking/' && tar -xvf cooking.tgz -C '../data/cooking/'\n",
    "\n",
    "#travel\n",
    "![ ! -d 'travel' ] && git clone --depth=1 https://github.com/ai-powered-search/travel.git\n",
    "! cd travel && git pull\n",
    "! cd travel && mkdir -p '../data/travel/' && tar -xvf travel.tgz -C '../data/travel/'\n",
    "\n",
    "#devops\n",
    "![ ! -d 'devops' ] && git clone --depth=1 https://github.com/ai-powered-search/devops.git\n",
    "! cd devops && git pull\n",
    "! cd devops && mkdir -p '../data/devops/' && tar -xvf devops.tgz -C '../data/devops/'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Index the Jobs Dataset into the Search Engine"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wiping \"jobs\" collection\n",
      "Creating \"jobs\" collection\n",
      "Status: Success\n",
      "Loading data/jobs/jobs.csv\n",
      "Schema: \n",
      "root\n",
      " |-- job_title: string (nullable = true)\n",
      " |-- job_description: string (nullable = true)\n",
      " |-- job_type: string (nullable = true)\n",
      " |-- category: string (nullable = false)\n",
      " |-- job_location: string (nullable = true)\n",
      " |-- job_city: string (nullable = true)\n",
      " |-- job_state: string (nullable = true)\n",
      " |-- job_country: string (nullable = true)\n",
      " |-- job_zip_code: string (nullable = true)\n",
      " |-- job_address: string (nullable = true)\n",
      " |-- min_salary: string (nullable = true)\n",
      " |-- max_salary: string (nullable = true)\n",
      " |-- salary_period: string (nullable = true)\n",
      " |-- apply_url: string (nullable = true)\n",
      " |-- apply_email: string (nullable = true)\n",
      " |-- num_employees: string (nullable = true)\n",
      " |-- industry: string (nullable = true)\n",
      " |-- company_name: string (nullable = true)\n",
      " |-- company_email: string (nullable = true)\n",
      " |-- company_website: string (nullable = true)\n",
      " |-- company_phone: string (nullable = true)\n",
      " |-- company_logo: string (nullable = true)\n",
      " |-- company_description: string (nullable = true)\n",
      " |-- company_location: string (nullable = true)\n",
      " |-- company_city: string (nullable = true)\n",
      " |-- company_state: string (nullable = true)\n",
      " |-- company_country: string (nullable = true)\n",
      " |-- company_zip_code: string (nullable = true)\n",
      " |-- job_date: timestamp (nullable = true)\n",
      "\n",
      "Successfully written 30002 documents\n"
     ]
    }
   ],
   "source": [
    "jobs_collection = engine.create_collection(\"jobs\")\n",
    "jobs_collection.write(from_csv(\"data/jobs/jobs.csv\", {\"category\": jobs_collection.name}))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Index StackExchange datasets: health, scifi, cooking, travel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wiping \"stackexchange\" collection\n",
      "Creating \"stackexchange\" collection\n",
      "Status: Success\n",
      "Loading data/health/posts.csv\n",
      "Schema: \n",
      "root\n",
      " |-- post_type_id: integer (nullable = true)\n",
      " |-- accepted_answer_id: integer (nullable = true)\n",
      " |-- parent_id: integer (nullable = true)\n",
      " |-- creation_date: timestamp (nullable = true)\n",
      " |-- deletion_date: string (nullable = true)\n",
      " |-- score: integer (nullable = true)\n",
      " |-- view_count: integer (nullable = true)\n",
      " |-- body: string (nullable = true)\n",
      " |-- owner_user_id: integer (nullable = true)\n",
      " |-- owner_display_name: string (nullable = true)\n",
      " |-- last_editor_user_id: integer (nullable = true)\n",
      " |-- last_editor_display_name: string (nullable = true)\n",
      " |-- last_edit_date: timestamp (nullable = true)\n",
      " |-- last_activity_date: timestamp (nullable = true)\n",
      " |-- title: string (nullable = true)\n",
      " |-- tags: string (nullable = true)\n",
      " |-- answer_count: integer (nullable = true)\n",
      " |-- comment_count: integer (nullable = true)\n",
      " |-- favorite_count: integer (nullable = true)\n",
      " |-- closed_date: timestamp (nullable = true)\n",
      " |-- community_owned_date: timestamp (nullable = true)\n",
      " |-- category: string (nullable = false)\n",
      "\n",
      "Successfully written 12892 documents\n",
      "Wiping \"health\" collection\n",
      "Creating \"health\" collection\n",
      "Status: Success\n",
      "Loading data/health/posts.csv\n",
      "Schema: \n",
      "root\n",
      " |-- post_type_id: integer (nullable = true)\n",
      " |-- accepted_answer_id: integer (nullable = true)\n",
      " |-- parent_id: integer (nullable = true)\n",
      " |-- creation_date: timestamp (nullable = true)\n",
      " |-- deletion_date: string (nullable = true)\n",
      " |-- score: integer (nullable = true)\n",
      " |-- view_count: integer (nullable = true)\n",
      " |-- body: string (nullable = true)\n",
      " |-- owner_user_id: integer (nullable = true)\n",
      " |-- owner_display_name: string (nullable = true)\n",
      " |-- last_editor_user_id: integer (nullable = true)\n",
      " |-- last_editor_display_name: string (nullable = true)\n",
      " |-- last_edit_date: timestamp (nullable = true)\n",
      " |-- last_activity_date: timestamp (nullable = true)\n",
      " |-- title: string (nullable = true)\n",
      " |-- tags: string (nullable = true)\n",
      " |-- answer_count: integer (nullable = true)\n",
      " |-- comment_count: integer (nullable = true)\n",
      " |-- favorite_count: integer (nullable = true)\n",
      " |-- closed_date: timestamp (nullable = true)\n",
      " |-- community_owned_date: timestamp (nullable = true)\n",
      " |-- category: string (nullable = false)\n",
      "\n",
      "Successfully written 12892 documents\n",
      "Loading data/cooking/posts.csv\n",
      "Schema: \n",
      "root\n",
      " |-- post_type_id: integer (nullable = true)\n",
      " |-- accepted_answer_id: integer (nullable = true)\n",
      " |-- parent_id: integer (nullable = true)\n",
      " |-- creation_date: timestamp (nullable = true)\n",
      " |-- deletion_date: string (nullable = true)\n",
      " |-- score: integer (nullable = true)\n",
      " |-- view_count: integer (nullable = true)\n",
      " |-- body: string (nullable = true)\n",
      " |-- owner_user_id: integer (nullable = true)\n",
      " |-- owner_display_name: string (nullable = true)\n",
      " |-- last_editor_user_id: integer (nullable = true)\n",
      " |-- last_editor_display_name: string (nullable = true)\n",
      " |-- last_edit_date: timestamp (nullable = true)\n",
      " |-- last_activity_date: timestamp (nullable = true)\n",
      " |-- title: string (nullable = true)\n",
      " |-- tags: string (nullable = true)\n",
      " |-- answer_count: integer (nullable = true)\n",
      " |-- comment_count: integer (nullable = true)\n",
      " |-- favorite_count: integer (nullable = true)\n",
      " |-- closed_date: timestamp (nullable = true)\n",
      " |-- community_owned_date: timestamp (nullable = true)\n",
      " |-- category: string (nullable = false)\n",
      "\n",
      "Successfully written 79324 documents\n",
      "Wiping \"cooking\" collection\n",
      "Creating \"cooking\" collection\n",
      "Status: Success\n",
      "Loading data/cooking/posts.csv\n",
      "Schema: \n",
      "root\n",
      " |-- post_type_id: integer (nullable = true)\n",
      " |-- accepted_answer_id: integer (nullable = true)\n",
      " |-- parent_id: integer (nullable = true)\n",
      " |-- creation_date: timestamp (nullable = true)\n",
      " |-- deletion_date: string (nullable = true)\n",
      " |-- score: integer (nullable = true)\n",
      " |-- view_count: integer (nullable = true)\n",
      " |-- body: string (nullable = true)\n",
      " |-- owner_user_id: integer (nullable = true)\n",
      " |-- owner_display_name: string (nullable = true)\n",
      " |-- last_editor_user_id: integer (nullable = true)\n",
      " |-- last_editor_display_name: string (nullable = true)\n",
      " |-- last_edit_date: timestamp (nullable = true)\n",
      " |-- last_activity_date: timestamp (nullable = true)\n",
      " |-- title: string (nullable = true)\n",
      " |-- tags: string (nullable = true)\n",
      " |-- answer_count: integer (nullable = true)\n",
      " |-- comment_count: integer (nullable = true)\n",
      " |-- favorite_count: integer (nullable = true)\n",
      " |-- closed_date: timestamp (nullable = true)\n",
      " |-- community_owned_date: timestamp (nullable = true)\n",
      " |-- category: string (nullable = false)\n",
      "\n",
      "Successfully written 79324 documents\n",
      "Loading data/scifi/posts.csv\n",
      "Schema: \n",
      "root\n",
      " |-- post_type_id: integer (nullable = true)\n",
      " |-- accepted_answer_id: integer (nullable = true)\n",
      " |-- parent_id: integer (nullable = true)\n",
      " |-- creation_date: timestamp (nullable = true)\n",
      " |-- deletion_date: string (nullable = true)\n",
      " |-- score: integer (nullable = true)\n",
      " |-- view_count: integer (nullable = true)\n",
      " |-- body: string (nullable = true)\n",
      " |-- owner_user_id: integer (nullable = true)\n",
      " |-- owner_display_name: string (nullable = true)\n",
      " |-- last_editor_user_id: integer (nullable = true)\n",
      " |-- last_editor_display_name: string (nullable = true)\n",
      " |-- last_edit_date: timestamp (nullable = true)\n",
      " |-- last_activity_date: timestamp (nullable = true)\n",
      " |-- title: string (nullable = true)\n",
      " |-- tags: string (nullable = true)\n",
      " |-- answer_count: integer (nullable = true)\n",
      " |-- comment_count: integer (nullable = true)\n",
      " |-- favorite_count: integer (nullable = true)\n",
      " |-- closed_date: timestamp (nullable = true)\n",
      " |-- community_owned_date: timestamp (nullable = true)\n",
      " |-- category: string (nullable = false)\n",
      "\n",
      "Successfully written 177547 documents\n",
      "Wiping \"scifi\" collection\n",
      "Creating \"scifi\" collection\n",
      "Status: Success\n",
      "Loading data/scifi/posts.csv\n",
      "Schema: \n",
      "root\n",
      " |-- post_type_id: integer (nullable = true)\n",
      " |-- accepted_answer_id: integer (nullable = true)\n",
      " |-- parent_id: integer (nullable = true)\n",
      " |-- creation_date: timestamp (nullable = true)\n",
      " |-- deletion_date: string (nullable = true)\n",
      " |-- score: integer (nullable = true)\n",
      " |-- view_count: integer (nullable = true)\n",
      " |-- body: string (nullable = true)\n",
      " |-- owner_user_id: integer (nullable = true)\n",
      " |-- owner_display_name: string (nullable = true)\n",
      " |-- last_editor_user_id: integer (nullable = true)\n",
      " |-- last_editor_display_name: string (nullable = true)\n",
      " |-- last_edit_date: timestamp (nullable = true)\n",
      " |-- last_activity_date: timestamp (nullable = true)\n",
      " |-- title: string (nullable = true)\n",
      " |-- tags: string (nullable = true)\n",
      " |-- answer_count: integer (nullable = true)\n",
      " |-- comment_count: integer (nullable = true)\n",
      " |-- favorite_count: integer (nullable = true)\n",
      " |-- closed_date: timestamp (nullable = true)\n",
      " |-- community_owned_date: timestamp (nullable = true)\n",
      " |-- category: string (nullable = false)\n",
      "\n",
      "Successfully written 177547 documents\n",
      "Loading data/travel/posts.csv\n",
      "Schema: \n",
      "root\n",
      " |-- post_type_id: integer (nullable = true)\n",
      " |-- accepted_answer_id: integer (nullable = true)\n",
      " |-- parent_id: integer (nullable = true)\n",
      " |-- creation_date: timestamp (nullable = true)\n",
      " |-- deletion_date: string (nullable = true)\n",
      " |-- score: integer (nullable = true)\n",
      " |-- view_count: integer (nullable = true)\n",
      " |-- body: string (nullable = true)\n",
      " |-- owner_user_id: integer (nullable = true)\n",
      " |-- owner_display_name: string (nullable = true)\n",
      " |-- last_editor_user_id: integer (nullable = true)\n",
      " |-- last_editor_display_name: string (nullable = true)\n",
      " |-- last_edit_date: timestamp (nullable = true)\n",
      " |-- last_activity_date: timestamp (nullable = true)\n",
      " |-- title: string (nullable = true)\n",
      " |-- tags: string (nullable = true)\n",
      " |-- answer_count: integer (nullable = true)\n",
      " |-- comment_count: integer (nullable = true)\n",
      " |-- favorite_count: integer (nullable = true)\n",
      " |-- closed_date: timestamp (nullable = true)\n",
      " |-- community_owned_date: timestamp (nullable = true)\n",
      " |-- category: string (nullable = false)\n",
      "\n",
      "Successfully written 111130 documents\n",
      "Wiping \"travel\" collection\n",
      "Creating \"travel\" collection\n",
      "Status: Success\n",
      "Loading data/travel/posts.csv\n",
      "Schema: \n",
      "root\n",
      " |-- post_type_id: integer (nullable = true)\n",
      " |-- accepted_answer_id: integer (nullable = true)\n",
      " |-- parent_id: integer (nullable = true)\n",
      " |-- creation_date: timestamp (nullable = true)\n",
      " |-- deletion_date: string (nullable = true)\n",
      " |-- score: integer (nullable = true)\n",
      " |-- view_count: integer (nullable = true)\n",
      " |-- body: string (nullable = true)\n",
      " |-- owner_user_id: integer (nullable = true)\n",
      " |-- owner_display_name: string (nullable = true)\n",
      " |-- last_editor_user_id: integer (nullable = true)\n",
      " |-- last_editor_display_name: string (nullable = true)\n",
      " |-- last_edit_date: timestamp (nullable = true)\n",
      " |-- last_activity_date: timestamp (nullable = true)\n",
      " |-- title: string (nullable = true)\n",
      " |-- tags: string (nullable = true)\n",
      " |-- answer_count: integer (nullable = true)\n",
      " |-- comment_count: integer (nullable = true)\n",
      " |-- favorite_count: integer (nullable = true)\n",
      " |-- closed_date: timestamp (nullable = true)\n",
      " |-- community_owned_date: timestamp (nullable = true)\n",
      " |-- category: string (nullable = false)\n",
      "\n",
      "Successfully written 111130 documents\n",
      "Loading data/devops/posts.csv\n",
      "Schema: \n",
      "root\n",
      " |-- post_type_id: integer (nullable = true)\n",
      " |-- accepted_answer_id: integer (nullable = true)\n",
      " |-- parent_id: integer (nullable = true)\n",
      " |-- creation_date: timestamp (nullable = true)\n",
      " |-- deletion_date: string (nullable = true)\n",
      " |-- score: integer (nullable = true)\n",
      " |-- view_count: integer (nullable = true)\n",
      " |-- body: string (nullable = true)\n",
      " |-- owner_user_id: integer (nullable = true)\n",
      " |-- owner_display_name: string (nullable = true)\n",
      " |-- last_editor_user_id: integer (nullable = true)\n",
      " |-- last_editor_display_name: string (nullable = true)\n",
      " |-- last_edit_date: timestamp (nullable = true)\n",
      " |-- last_activity_date: timestamp (nullable = true)\n",
      " |-- title: string (nullable = true)\n",
      " |-- tags: string (nullable = true)\n",
      " |-- answer_count: integer (nullable = true)\n",
      " |-- comment_count: integer (nullable = true)\n",
      " |-- favorite_count: integer (nullable = true)\n",
      " |-- closed_date: timestamp (nullable = true)\n",
      " |-- community_owned_date: timestamp (nullable = true)\n",
      " |-- category: string (nullable = false)\n",
      "\n",
      "Successfully written 9216 documents\n",
      "Wiping \"devops\" collection\n",
      "Creating \"devops\" collection\n",
      "Status: Success\n",
      "Loading data/devops/posts.csv\n",
      "Schema: \n",
      "root\n",
      " |-- post_type_id: integer (nullable = true)\n",
      " |-- accepted_answer_id: integer (nullable = true)\n",
      " |-- parent_id: integer (nullable = true)\n",
      " |-- creation_date: timestamp (nullable = true)\n",
      " |-- deletion_date: string (nullable = true)\n",
      " |-- score: integer (nullable = true)\n",
      " |-- view_count: integer (nullable = true)\n",
      " |-- body: string (nullable = true)\n",
      " |-- owner_user_id: integer (nullable = true)\n",
      " |-- owner_display_name: string (nullable = true)\n",
      " |-- last_editor_user_id: integer (nullable = true)\n",
      " |-- last_editor_display_name: string (nullable = true)\n",
      " |-- last_edit_date: timestamp (nullable = true)\n",
      " |-- last_activity_date: timestamp (nullable = true)\n",
      " |-- title: string (nullable = true)\n",
      " |-- tags: string (nullable = true)\n",
      " |-- answer_count: integer (nullable = true)\n",
      " |-- comment_count: integer (nullable = true)\n",
      " |-- favorite_count: integer (nullable = true)\n",
      " |-- closed_date: timestamp (nullable = true)\n",
      " |-- community_owned_date: timestamp (nullable = true)\n",
      " |-- category: string (nullable = false)\n",
      "\n",
      "Successfully written 9216 documents\n"
     ]
    }
   ],
   "source": [
    "se_collection = engine.create_collection(\"stackexchange\")\n",
    "\n",
    "datasets = [\"health\", \"cooking\", \"scifi\", \"travel\", \"devops\"]\n",
    "for dataset in datasets:\n",
    "    file = f\"data/{dataset}/posts.csv\"\n",
    "    se_collection.write(from_csv(file, {\"category\": dataset}))\n",
    "    collection = engine.create_collection(dataset)\n",
    "    collection.write(from_csv(file, {\"category\": dataset}))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dual index datasets into Solr for SKG"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wiping \"jobs\" collection\n",
      "Creating \"jobs\" collection\n",
      "Status: Success\n",
      "Loading data/jobs/jobs.csv\n",
      "Schema: \n",
      "root\n",
      " |-- job_title: string (nullable = true)\n",
      " |-- job_description: string (nullable = true)\n",
      " |-- job_type: string (nullable = true)\n",
      " |-- category: string (nullable = false)\n",
      " |-- job_location: string (nullable = true)\n",
      " |-- job_city: string (nullable = true)\n",
      " |-- job_state: string (nullable = true)\n",
      " |-- job_country: string (nullable = true)\n",
      " |-- job_zip_code: string (nullable = true)\n",
      " |-- job_address: string (nullable = true)\n",
      " |-- min_salary: string (nullable = true)\n",
      " |-- max_salary: string (nullable = true)\n",
      " |-- salary_period: string (nullable = true)\n",
      " |-- apply_url: string (nullable = true)\n",
      " |-- apply_email: string (nullable = true)\n",
      " |-- num_employees: string (nullable = true)\n",
      " |-- industry: string (nullable = true)\n",
      " |-- company_name: string (nullable = true)\n",
      " |-- company_email: string (nullable = true)\n",
      " |-- company_website: string (nullable = true)\n",
      " |-- company_phone: string (nullable = true)\n",
      " |-- company_logo: string (nullable = true)\n",
      " |-- company_description: string (nullable = true)\n",
      " |-- company_location: string (nullable = true)\n",
      " |-- company_city: string (nullable = true)\n",
      " |-- company_state: string (nullable = true)\n",
      " |-- company_country: string (nullable = true)\n",
      " |-- company_zip_code: string (nullable = true)\n",
      " |-- job_date: timestamp (nullable = true)\n",
      "\n",
      "Successfully written 30002 documents\n",
      "Wiping \"stackexchange\" collection\n",
      "Creating \"stackexchange\" collection\n",
      "Status: Success\n",
      "Loading data/health/posts.csv\n",
      "Schema: \n",
      "root\n",
      " |-- post_type_id: integer (nullable = true)\n",
      " |-- accepted_answer_id: integer (nullable = true)\n",
      " |-- parent_id: integer (nullable = true)\n",
      " |-- creation_date: timestamp (nullable = true)\n",
      " |-- deletion_date: string (nullable = true)\n",
      " |-- score: integer (nullable = true)\n",
      " |-- view_count: integer (nullable = true)\n",
      " |-- body: string (nullable = true)\n",
      " |-- owner_user_id: integer (nullable = true)\n",
      " |-- owner_display_name: string (nullable = true)\n",
      " |-- last_editor_user_id: integer (nullable = true)\n",
      " |-- last_editor_display_name: string (nullable = true)\n",
      " |-- last_edit_date: timestamp (nullable = true)\n",
      " |-- last_activity_date: timestamp (nullable = true)\n",
      " |-- title: string (nullable = true)\n",
      " |-- tags: string (nullable = true)\n",
      " |-- answer_count: integer (nullable = true)\n",
      " |-- comment_count: integer (nullable = true)\n",
      " |-- favorite_count: integer (nullable = true)\n",
      " |-- closed_date: timestamp (nullable = true)\n",
      " |-- community_owned_date: timestamp (nullable = true)\n",
      " |-- category: string (nullable = false)\n",
      "\n",
      "Successfully written 12892 documents\n",
      "Wiping \"health\" collection\n",
      "Creating \"health\" collection\n",
      "Status: Success\n",
      "Loading data/health/posts.csv\n",
      "Schema: \n",
      "root\n",
      " |-- post_type_id: integer (nullable = true)\n",
      " |-- accepted_answer_id: integer (nullable = true)\n",
      " |-- parent_id: integer (nullable = true)\n",
      " |-- creation_date: timestamp (nullable = true)\n",
      " |-- deletion_date: string (nullable = true)\n",
      " |-- score: integer (nullable = true)\n",
      " |-- view_count: integer (nullable = true)\n",
      " |-- body: string (nullable = true)\n",
      " |-- owner_user_id: integer (nullable = true)\n",
      " |-- owner_display_name: string (nullable = true)\n",
      " |-- last_editor_user_id: integer (nullable = true)\n",
      " |-- last_editor_display_name: string (nullable = true)\n",
      " |-- last_edit_date: timestamp (nullable = true)\n",
      " |-- last_activity_date: timestamp (nullable = true)\n",
      " |-- title: string (nullable = true)\n",
      " |-- tags: string (nullable = true)\n",
      " |-- answer_count: integer (nullable = true)\n",
      " |-- comment_count: integer (nullable = true)\n",
      " |-- favorite_count: integer (nullable = true)\n",
      " |-- closed_date: timestamp (nullable = true)\n",
      " |-- community_owned_date: timestamp (nullable = true)\n",
      " |-- category: string (nullable = false)\n",
      "\n",
      "Successfully written 12892 documents\n",
      "Loading data/cooking/posts.csv\n",
      "Schema: \n",
      "root\n",
      " |-- post_type_id: integer (nullable = true)\n",
      " |-- accepted_answer_id: integer (nullable = true)\n",
      " |-- parent_id: integer (nullable = true)\n",
      " |-- creation_date: timestamp (nullable = true)\n",
      " |-- deletion_date: string (nullable = true)\n",
      " |-- score: integer (nullable = true)\n",
      " |-- view_count: integer (nullable = true)\n",
      " |-- body: string (nullable = true)\n",
      " |-- owner_user_id: integer (nullable = true)\n",
      " |-- owner_display_name: string (nullable = true)\n",
      " |-- last_editor_user_id: integer (nullable = true)\n",
      " |-- last_editor_display_name: string (nullable = true)\n",
      " |-- last_edit_date: timestamp (nullable = true)\n",
      " |-- last_activity_date: timestamp (nullable = true)\n",
      " |-- title: string (nullable = true)\n",
      " |-- tags: string (nullable = true)\n",
      " |-- answer_count: integer (nullable = true)\n",
      " |-- comment_count: integer (nullable = true)\n",
      " |-- favorite_count: integer (nullable = true)\n",
      " |-- closed_date: timestamp (nullable = true)\n",
      " |-- community_owned_date: timestamp (nullable = true)\n",
      " |-- category: string (nullable = false)\n",
      "\n",
      "Successfully written 79324 documents\n",
      "Wiping \"cooking\" collection\n",
      "Creating \"cooking\" collection\n",
      "Status: Success\n",
      "Loading data/cooking/posts.csv\n",
      "Schema: \n",
      "root\n",
      " |-- post_type_id: integer (nullable = true)\n",
      " |-- accepted_answer_id: integer (nullable = true)\n",
      " |-- parent_id: integer (nullable = true)\n",
      " |-- creation_date: timestamp (nullable = true)\n",
      " |-- deletion_date: string (nullable = true)\n",
      " |-- score: integer (nullable = true)\n",
      " |-- view_count: integer (nullable = true)\n",
      " |-- body: string (nullable = true)\n",
      " |-- owner_user_id: integer (nullable = true)\n",
      " |-- owner_display_name: string (nullable = true)\n",
      " |-- last_editor_user_id: integer (nullable = true)\n",
      " |-- last_editor_display_name: string (nullable = true)\n",
      " |-- last_edit_date: timestamp (nullable = true)\n",
      " |-- last_activity_date: timestamp (nullable = true)\n",
      " |-- title: string (nullable = true)\n",
      " |-- tags: string (nullable = true)\n",
      " |-- answer_count: integer (nullable = true)\n",
      " |-- comment_count: integer (nullable = true)\n",
      " |-- favorite_count: integer (nullable = true)\n",
      " |-- closed_date: timestamp (nullable = true)\n",
      " |-- community_owned_date: timestamp (nullable = true)\n",
      " |-- category: string (nullable = false)\n",
      "\n",
      "Successfully written 79324 documents\n",
      "Loading data/scifi/posts.csv\n",
      "Schema: \n",
      "root\n",
      " |-- post_type_id: integer (nullable = true)\n",
      " |-- accepted_answer_id: integer (nullable = true)\n",
      " |-- parent_id: integer (nullable = true)\n",
      " |-- creation_date: timestamp (nullable = true)\n",
      " |-- deletion_date: string (nullable = true)\n",
      " |-- score: integer (nullable = true)\n",
      " |-- view_count: integer (nullable = true)\n",
      " |-- body: string (nullable = true)\n",
      " |-- owner_user_id: integer (nullable = true)\n",
      " |-- owner_display_name: string (nullable = true)\n",
      " |-- last_editor_user_id: integer (nullable = true)\n",
      " |-- last_editor_display_name: string (nullable = true)\n",
      " |-- last_edit_date: timestamp (nullable = true)\n",
      " |-- last_activity_date: timestamp (nullable = true)\n",
      " |-- title: string (nullable = true)\n",
      " |-- tags: string (nullable = true)\n",
      " |-- answer_count: integer (nullable = true)\n",
      " |-- comment_count: integer (nullable = true)\n",
      " |-- favorite_count: integer (nullable = true)\n",
      " |-- closed_date: timestamp (nullable = true)\n",
      " |-- community_owned_date: timestamp (nullable = true)\n",
      " |-- category: string (nullable = false)\n",
      "\n",
      "Successfully written 177547 documents\n",
      "Wiping \"scifi\" collection\n",
      "Creating \"scifi\" collection\n",
      "Status: Success\n",
      "Loading data/scifi/posts.csv\n",
      "Schema: \n",
      "root\n",
      " |-- post_type_id: integer (nullable = true)\n",
      " |-- accepted_answer_id: integer (nullable = true)\n",
      " |-- parent_id: integer (nullable = true)\n",
      " |-- creation_date: timestamp (nullable = true)\n",
      " |-- deletion_date: string (nullable = true)\n",
      " |-- score: integer (nullable = true)\n",
      " |-- view_count: integer (nullable = true)\n",
      " |-- body: string (nullable = true)\n",
      " |-- owner_user_id: integer (nullable = true)\n",
      " |-- owner_display_name: string (nullable = true)\n",
      " |-- last_editor_user_id: integer (nullable = true)\n",
      " |-- last_editor_display_name: string (nullable = true)\n",
      " |-- last_edit_date: timestamp (nullable = true)\n",
      " |-- last_activity_date: timestamp (nullable = true)\n",
      " |-- title: string (nullable = true)\n",
      " |-- tags: string (nullable = true)\n",
      " |-- answer_count: integer (nullable = true)\n",
      " |-- comment_count: integer (nullable = true)\n",
      " |-- favorite_count: integer (nullable = true)\n",
      " |-- closed_date: timestamp (nullable = true)\n",
      " |-- community_owned_date: timestamp (nullable = true)\n",
      " |-- category: string (nullable = false)\n",
      "\n",
      "Successfully written 177547 documents\n",
      "Loading data/travel/posts.csv\n",
      "Schema: \n",
      "root\n",
      " |-- post_type_id: integer (nullable = true)\n",
      " |-- accepted_answer_id: integer (nullable = true)\n",
      " |-- parent_id: integer (nullable = true)\n",
      " |-- creation_date: timestamp (nullable = true)\n",
      " |-- deletion_date: string (nullable = true)\n",
      " |-- score: integer (nullable = true)\n",
      " |-- view_count: integer (nullable = true)\n",
      " |-- body: string (nullable = true)\n",
      " |-- owner_user_id: integer (nullable = true)\n",
      " |-- owner_display_name: string (nullable = true)\n",
      " |-- last_editor_user_id: integer (nullable = true)\n",
      " |-- last_editor_display_name: string (nullable = true)\n",
      " |-- last_edit_date: timestamp (nullable = true)\n",
      " |-- last_activity_date: timestamp (nullable = true)\n",
      " |-- title: string (nullable = true)\n",
      " |-- tags: string (nullable = true)\n",
      " |-- answer_count: integer (nullable = true)\n",
      " |-- comment_count: integer (nullable = true)\n",
      " |-- favorite_count: integer (nullable = true)\n",
      " |-- closed_date: timestamp (nullable = true)\n",
      " |-- community_owned_date: timestamp (nullable = true)\n",
      " |-- category: string (nullable = false)\n",
      "\n",
      "Successfully written 111130 documents\n",
      "Wiping \"travel\" collection\n",
      "Creating \"travel\" collection\n",
      "Status: Success\n",
      "Loading data/travel/posts.csv\n",
      "Schema: \n",
      "root\n",
      " |-- post_type_id: integer (nullable = true)\n",
      " |-- accepted_answer_id: integer (nullable = true)\n",
      " |-- parent_id: integer (nullable = true)\n",
      " |-- creation_date: timestamp (nullable = true)\n",
      " |-- deletion_date: string (nullable = true)\n",
      " |-- score: integer (nullable = true)\n",
      " |-- view_count: integer (nullable = true)\n",
      " |-- body: string (nullable = true)\n",
      " |-- owner_user_id: integer (nullable = true)\n",
      " |-- owner_display_name: string (nullable = true)\n",
      " |-- last_editor_user_id: integer (nullable = true)\n",
      " |-- last_editor_display_name: string (nullable = true)\n",
      " |-- last_edit_date: timestamp (nullable = true)\n",
      " |-- last_activity_date: timestamp (nullable = true)\n",
      " |-- title: string (nullable = true)\n",
      " |-- tags: string (nullable = true)\n",
      " |-- answer_count: integer (nullable = true)\n",
      " |-- comment_count: integer (nullable = true)\n",
      " |-- favorite_count: integer (nullable = true)\n",
      " |-- closed_date: timestamp (nullable = true)\n",
      " |-- community_owned_date: timestamp (nullable = true)\n",
      " |-- category: string (nullable = false)\n",
      "\n",
      "Successfully written 111130 documents\n",
      "Loading data/devops/posts.csv\n",
      "Schema: \n",
      "root\n",
      " |-- post_type_id: integer (nullable = true)\n",
      " |-- accepted_answer_id: integer (nullable = true)\n",
      " |-- parent_id: integer (nullable = true)\n",
      " |-- creation_date: timestamp (nullable = true)\n",
      " |-- deletion_date: string (nullable = true)\n",
      " |-- score: integer (nullable = true)\n",
      " |-- view_count: integer (nullable = true)\n",
      " |-- body: string (nullable = true)\n",
      " |-- owner_user_id: integer (nullable = true)\n",
      " |-- owner_display_name: string (nullable = true)\n",
      " |-- last_editor_user_id: integer (nullable = true)\n",
      " |-- last_editor_display_name: string (nullable = true)\n",
      " |-- last_edit_date: timestamp (nullable = true)\n",
      " |-- last_activity_date: timestamp (nullable = true)\n",
      " |-- title: string (nullable = true)\n",
      " |-- tags: string (nullable = true)\n",
      " |-- answer_count: integer (nullable = true)\n",
      " |-- comment_count: integer (nullable = true)\n",
      " |-- favorite_count: integer (nullable = true)\n",
      " |-- closed_date: timestamp (nullable = true)\n",
      " |-- community_owned_date: timestamp (nullable = true)\n",
      " |-- category: string (nullable = false)\n",
      "\n",
      "Successfully written 9216 documents\n",
      "Wiping \"devops\" collection\n",
      "Creating \"devops\" collection\n",
      "Status: Success\n",
      "Loading data/devops/posts.csv\n",
      "Schema: \n",
      "root\n",
      " |-- post_type_id: integer (nullable = true)\n",
      " |-- accepted_answer_id: integer (nullable = true)\n",
      " |-- parent_id: integer (nullable = true)\n",
      " |-- creation_date: timestamp (nullable = true)\n",
      " |-- deletion_date: string (nullable = true)\n",
      " |-- score: integer (nullable = true)\n",
      " |-- view_count: integer (nullable = true)\n",
      " |-- body: string (nullable = true)\n",
      " |-- owner_user_id: integer (nullable = true)\n",
      " |-- owner_display_name: string (nullable = true)\n",
      " |-- last_editor_user_id: integer (nullable = true)\n",
      " |-- last_editor_display_name: string (nullable = true)\n",
      " |-- last_edit_date: timestamp (nullable = true)\n",
      " |-- last_activity_date: timestamp (nullable = true)\n",
      " |-- title: string (nullable = true)\n",
      " |-- tags: string (nullable = true)\n",
      " |-- answer_count: integer (nullable = true)\n",
      " |-- comment_count: integer (nullable = true)\n",
      " |-- favorite_count: integer (nullable = true)\n",
      " |-- closed_date: timestamp (nullable = true)\n",
      " |-- community_owned_date: timestamp (nullable = true)\n",
      " |-- category: string (nullable = false)\n",
      "\n",
      "Successfully written 9216 documents\n"
     ]
    }
   ],
   "source": [
    "# Index the jobs CSV into its own \"jobs\" collection; the extra \"category\" column is set to the collection name.\n",
    "jobs_collection = get_engine(\"solr\").create_collection(\"jobs\")\n",
    "jobs_collection.write(from_csv(\"data/jobs/jobs.csv\", {\"category\": jobs_collection.name}))\n",
    "\n",
    "# Combined collection that receives the posts of every dataset listed below.\n",
    "se_collection = get_engine(\"solr\").create_collection(\"stackexchange\")\n",
    "\n",
    "datasets = [\"health\", \"cooking\", \"scifi\", \"travel\", \"devops\"]\n",
    "for dataset in datasets:\n",
    "    file = f\"data/{dataset}/posts.csv\"\n",
    "    # Load each CSV once and reuse it for both writes; calling from_csv twice\n",
    "    # with identical arguments loaded (and logged) the same file twice per dataset.\n",
    "    posts = from_csv(file, {\"category\": dataset})\n",
    "    se_collection.write(posts)\n",
    "    # Each dataset additionally gets a dedicated collection named after it.\n",
    "    collection = get_engine(\"solr\").create_collection(dataset)\n",
    "    collection.write(posts)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Success!\n",
    "\n",
    "Now that you've indexed several large text datasets, in the next notebook we will explore the rich graph of semantic relationships embedded within those documents by leveraging Semantic Knowledge Graphs for real-time traversal and ranking of arbitrary relationships within the domains of our datasets.\n",
    "\n",
    "Up next: [Working with Semantic Knowledge Graphs](3.semantic-knowledge-graph.ipynb)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
